ISA DeepData

This notebook processes data from the International Seabed Authority (ISA) DeepData database into Darwin Core archives. The resulting datasets are hosted at https://datasets.obis.org/hosted/isa/index.html.

The code for this notebook is hosted at https://github.com/iobis/notebook-deepdata.

Data flow

Reading the data from S3

The DeepData dataset is delivered to OBIS via S3. Credentials are stored in env.txt. Earlier versions of the file were encoded in ISO-8859-1 rather than the UTF-8 required for JSON, which made it necessary to use readLines before parsing the data with the jsonlite package, but that is fixed now.

# Dependencies for reading the raw JSON export from S3.
# library() errors when a package is missing; require() only returns FALSE
# with a warning, which would let the script continue and fail later.
library(RCurl)
library(stringr)
library(dplyr)
library(aws.s3)

# S3 credentials (AWS access key/secret) are kept out of the repository.
readRenviron("env.txt")
json <- get_object("uploads/isa/deepdata.json", "obis-datasets", as = "text")

Parsing the JSON file

library(jsonlite)
library(purrr)

# Parse the JSON payload; the records of interest live under the DEEPDATA key.
deepdata <- fromJSON(json, simplifyDataFrame = TRUE)[["DEEPDATA"]]
records <- as_tibble(deepdata)

Generating Darwin Core data files

We can now extract a list of distinct datasets from the data frame, and generate a Darwin Core archive for each dataset. Let’s first generate dataset identifiers from the dataset titles for later use in the dataset URLs. To account for possible future changes to dataset titles, I’m removing or fixing some words in the titles. The result should be that identifiers do not change when typos are fixed in the future.

library(knitr)

# Distinct dataset titles, in order of first appearance in the records.
titles <- unique(records$Metadata$title)

# Normalize the titles into stable identifiers: fix known typos first (so
# that future upstream corrections do not change an identifier), drop the
# word "template", then turn hyphens and whitespace runs into underscores.
identifiers <- titles %>%
  tolower(.) %>%
  str_replace(., "meiofaun$", "meiofauna") %>%
  str_replace(., "templaye", "template") %>%
  str_replace(., "template", "") %>%
  str_replace(., "biodiveristy", "biodiversity") %>%
  str_replace(., "macrfaun$", "macrofauna") %>%
  str_replace(., "meofauna", "meiofauna") %>%
  str_replace(., "meiobent$", "meiobenthos") %>%
  str_replace_all(., "-", " ") %>%   # _all: a title may contain several hyphens
  str_squish(.) %>%
  str_replace_all(., "\\s", "_")

# Identifiers must stay unique, otherwise two datasets would be merged.
stopifnot(length(unique(titles)) == length(unique(identifiers)))
records$dataset_id <- identifiers[match(records$Metadata$title, titles)]

data.frame(titles, identifiers) %>%
  kable()
titles identifiers
BGRPMN12015 BIODIVERSITY bgrpmn12015_biodiversity
BGRPMN12017 Biodiveristy bgrpmn12017_biodiversity
BGRPMN12016 Biodiversity Envdata bgrpmn12016_biodiversity_envdata
BGRPMN12017 Env Template MANGAN2013 bgrpmn12017_env_mangan2013
BGRPMN12017 Env Template BIONOD2012 bgrpmn12017_env_bionod2012
RUSMNRCRFC12015 Env Template Biodata rusmnrcrfc12015_env_biodata
BGRPMN12017 Env Template MANGAN2016 bgrpmn12017_env_mangan2016
BGRPMN12017 Env Template FLUM bgrpmn12017_env_flum
BGRPMN12017 Env Template MANGAN2010 bgrpmn12017_env_mangan2010
GSRPMN12017 Env Template BIOSO239 gsrpmn12017_env_bioso239
DORDPMN12016 Mn2016 ENV dordpmn12016_mn2016_env
DORDPMN12018 Env Mn Bio dordpmn12018_env_mn_bio
IOMPMN12018 Env Template BIOL iompmn12018_env_biol
IOMPMN12015 Env Template annex 11 iompmn12015_env_annex_11
IOMPMN12014 Env Bio iompmn12014_env_bio
DORDPMN12020 Env dordpmn12020_env
BGRPMN12017 Env Template ECORESPONSE bgrpmn12017_env_ecoresponse
BGRPMN12017 Env Template MANGAN2014 bgrpmn12017_env_mangan2014
KOREAPMN12011 Env Template 2011 abundance koreapmn12011_env_2011_abundance
COMRAPMS12016 Env Template phytoplankton comrapms12016_env_phytoplankton
IOMPMN12015 Annex 1 env iompmn12015_annex_1_env
IOMPMN12015 Env Template annex 1 iompmn12015_env_annex_1
COMRAPMS12016 Env Template phytoplanton comrapms12016_env_phytoplanton
COMRAPMS12017 Env Template zooplankton comrapms12017_env_zooplankton
COMRAPMS12018 zooplankton comrapms12018_zooplankton
YUZHPMN12015 Biodata B6 yuzhpmn12015_biodata_b6
YUZPMN12016 Biodata yuzpmn12016_biodata
OMSPMN12017 Scavengers Senckenberg omspmn12017_scavengers_senckenberg
UKSRLPMN12015 Env Template Scavengers 032016 uksrlpmn12015_env_scavengers_032016
UKSRLPMN12016 NHM-UNI uksrlpmn12016_nhm_uni
UKSRLPMN12016 Senkenberg uksrlpmn12016_senkenberg
UKSRLPMN12016 Megafauna uksrlpmn12016_megafauna
UKSRLPMN12017 NHM UNI uksrlpmn12017_nhm_uni
UKSRLPMN12017 Senkenberg Macrofauna uksrlpmn12017_senkenberg_macrofauna
GSRPMN12016 MarBiol UGent gsrpmn12016_marbiol_ugent
GSRPMN12020 MarBiol UGent gsrpmn12020_marbiol_ugent
KOREAPMN12012 Env Template 2012 macrofauna koreapmn12012_env_2012_macrofauna
KOREAPMN12012 Env Template 2012 biomass koreapmn12012_env_2012_biomass
KOREAPMN12012 Meiofauna koreapmn12012_meiofauna
IFREMERPMN12015 AR ENV ifremerpmn12015_ar_env
COMRAPMN12018 Lander2017 comrapmn12018_lander2017
COMRAPMS12017 Env Template phytoplankton comrapms12017_env_phytoplankton
KOREAPMN12014 Env Template 2014 megafauna koreapmn12014_env_2014_megafauna
IFREMERPMN12018 Nodinaut Nematoda Copepoda ifremerpmn12018_nodinaut_nematoda_copepoda
JOGMECCRFC12018 Env Template HK17-01 phyto jogmeccrfc12018_env_hk17_01_phyto
JOGMECCRFC12018 Env Template JK18 picoplankton jogmeccrfc12018_env_jk18_picoplankton
JOGMECCRFC12018 Env Template HK17 NEMA jogmeccrfc12018_env_hk17_nema
JOGMECCRFC12016 Env Template 2016 nematoda DNA jogmeccrfc12016_env_2016_nematoda_dna
JOGMECCRFC120017 Env Template 2017 abundance jogmeccrfc120017_env_2017_abundance
JOGMECCRFC12016 Env Template 2016 Edokko data jogmeccrfc12016_env_2016_edokko_data
JOGMECCRFC12016 Env Template 2016 ROV jogmeccrfc12016_env_2016_rov
COMRACRFC12016 Env Template DY37 megafauna comracrfc12016_env_dy37_megafauna
COMRACRFC12015 Env Template Meiofaun comracrfc12015_env_meiofauna
COMRACFRC120015 Env Template 2015 demersal scavenger comracfrc120015_env_2015_demersal_scavenger
COMRACRFC12015 Env Template MacrFaun comracrfc12015_env_macrofauna
COMRACFRC120015 Env Template 2015 macrofauna comracfrc120015_env_2015_macrofauna
KOREAPMN12010 Env Template 2010 abundance koreapmn12010_env_2010_abundance
KOREAPMN12010 Env Template 2010 biomass koreapmn12010_env_2010_biomass
KOREAPMN12011 Env Template 2011 biomass koreapmn12011_env_2011_biomass
KOREAPMN12012 Env Template 2012 abundance koreapmn12012_env_2012_abundance
KOREAPMN12014 Env Template 2014 abundance koreapmn12014_env_2014_abundance
KOREAPMN12014 Env Template 2014 biomass koreapmn12014_env_2014_biomass
KOREAPMN12013 Env Template 2013 abundance koreapmn12013_env_2013_abundance
COMRACRFC12017 Env Template DY36 meiofauna comracrfc12017_env_dy36_meiofauna
OMSPMN12017 Macro Senckenberg omspmn12017_macro_senckenberg
UKSRLPMN12017 NOC NERC uksrlpmn12017_noc_nerc
OMSPMN12018 Env NHM NORCE omspmn12018_env_nhm_norce
UKSRLPMN12016 AB02 NOCS uksrlpmn12016_ab02_nocs
COMRAPMN12016 Sequences comrapmn12016_sequences
COMRAPMS12015 Env Template Meiobent comrapms12015_env_meiobenthos
OMSPMN12018 NUS Data omspmn12018_nus_data
COMRAPMS12018 Phytoplankton comrapms12018_phytoplankton
COMRAPMS12015 ENV comrapms12015_env
IFREMERPMN12018 SO239 ifremerpmn12018_so239
UKSRLPMN12015 Env Template Macrofauna 032016 uksrlpmn12015_env_macrofauna_032016
UKSRLPMN12015 Env Template Senckenberg 032016 uksrlpmn12015_env_senckenberg_032016
UKSRLPMN12015 Env Template GG 032020163 uksrlpmn12015_env_gg_032020163
UKSRLPMN12015 Env Template AB01 NHM uksrlpmn12015_env_ab01_nhm
UKSRLPMN12015 Env Templaye Megafauna 032016 uksrlpmn12015_env_megafauna_032016
UKSRLPMN12017 Senkenberg Meofauna uksrlpmn12017_senkenberg_meiofauna
COMRAPMN12014 Env Template W1101 comrapmn12014_env_w1101
COMRAPMN12014 Env Template W1102 comrapmn12014_env_w1102
COMRAPMN12014 Env Template WS1102 comrapmn12014_env_ws1102
COMRAPMN12014 Env Template WS1104 comrapmn12014_env_ws1104
IFREMERPMN12017 Env Template BIO1 2017 ifremerpmn12017_env_bio1_2017
COMRACRFC12017 Env Template DY29 zooplankton comracrfc12017_env_dy29_zooplankton

Extracting occurrence data

Let’s first create a new ID column, this will be used later to link together the measurements and occurrences, and to select records by dataset. We cannot use occurrenceID here because these are not unique within the dataset.

library(uuid)

# Generate one random (v4) UUID per record (use.time = NA selects random
# UUIDs). These ids link measurements to occurrences and are also used to
# select records by dataset, so they must be unique.
records$id <- UUIDgenerate(use.time = NA, n = nrow(records))
stopifnot(length(unique(records$id)) == nrow(records))

Now we can select and process the columns that will go into the occurrence table.

# Build the occurrence table: keep the occurrence-related Darwin Core groups,
# flatten the nested data frame columns, then strip the group prefix from the
# resulting names (e.g. "Event.eventDate" -> "eventDate").
extract_occurrences <- function(df) {
  selected <- df %>%
    select("id", "dataset_id", "Occurrence", "Event", "Location",
           "Identification", "Record-level", "Taxon")
  flat <- jsonlite::flatten(selected)
  flat %>%
    rename_all(~str_replace(., ".*\\.", "")) %>%
    as_tibble()
}

occ <- extract_occurrences(records)

Initial cleanup of occurrence data

First clean up any escaped newlines, empty strings, and placeholder values. Also fix basisOfRecord and convert coordinates to numeric values:

library(stringr)

# Clean up the raw occurrence values: remove escaped newlines, convert empty
# strings and known placeholder values to NA, squish whitespace, set a
# constant basisOfRecord and parse the coordinates as numbers.
# across(everything(), ...) replaces the superseded mutate_all(), consistent
# with the across(where(is.character), ...) call already in this pipeline.
occ <- occ %>%
  mutate(across(everything(), ~gsub("\\n", "", .))) %>%
  mutate(across(everything(), ~na_if(., ""))) %>%
  mutate(across(where(is.character), str_squish)) %>%
  mutate(across(everything(), ~replace(., . %in% c("indet", "Not Reported", "indet."), NA))) %>%
  mutate(basisOfRecord = "HumanObservation") %>%
  mutate(
    decimalLongitude = as.numeric(decimalLongitude),
    decimalLatitude = as.numeric(decimalLatitude)
  )

Let’s check for coordinates issues:

# Quick visual sanity check of the coordinates on a world map.
robis::map_ggplot(occ)

Let’s take a look at scientificName and scientificNameID.

# Most frequent scientificName values.
occ %>%
  group_by(scientificName) %>%
  tally(name = "records") %>%
  arrange(desc(records)) %>%
  rmarkdown::paged_table()
# Most frequent scientificNameID values.
occ %>%
  group_by(scientificNameID) %>%
  tally(name = "records") %>%
  arrange(desc(records)) %>%
  rmarkdown::paged_table()

So at least in the current version at the time of writing (June 2021) there are some quality issues for scientificName.

Fixing taxonomy

Let’s try to clean up the scientific names before we do taxon matching with WoRMS. Here I’m using the gni_parse() function from the taxize package, which connects to the GNI name parser. If a name cannot be parsed, I’m keeping the original.

The first step is to create a list of all distinct names in the taxonomy columns.

# Collect all distinct non-missing names across the taxonomy columns.
taxonomy <- select(occ, phylum, class, order, family, genus, scientificName)
names <- na.omit(unique(unlist(taxonomy)))

Then pass through the name parser:

library(taxize)

# Parse a single name with the GNI name parser and return its canonical
# form; fall back to the original name when parsing fails or is ambiguous
# (more than one parse result). The original code assigned the tryCatch
# value to a dead local variable; the value is now returned directly.
clean_name <- function(name) {
  tryCatch({
    res <- gni_parse(name)
    stopifnot(nrow(res) == 1)
    res$canonical[1]
  },
  error = function(cond) {
    name
  })
}

# sapply() names the result after its input, which is what makes the
# names_clean["original name"] lookups further down work.
names_clean <- sapply(names, clean_name)

Now use the cleaned names for taxon matching:

library(worrms)

# Match one cleaned name against WoRMS and return the LSID of the first
# exact (or exact genus) match; NA when nothing matches or the service
# errors. With multiple exact matches a message is emitted and the first
# is kept. The dead `lsid <-` assignment from the original is removed;
# the tryCatch value is returned directly.
match_name <- function(name) {
  tryCatch({
    res <- wm_records_names(name)
    matches <- res[[1]] %>%
      filter(match_type == "exact" | match_type == "exact_genus")
    if (nrow(matches) > 1) {
      message(paste0("Multiple exact matches for ", name))
    }
    matches$lsid[1]  # NA_character_ when matches has zero rows
  }, error = function(cond) {
    NA
  })
}

# Named vector: lsids["cleaned name"] -> LSID.
lsids <- sapply(names_clean, match_name)

Now we need to find the lowest taxonomic level at which we find a name. Note that this will result in records with less taxonomic resolution than intended. Ideally we would only match on scientificName. First translate the taxonomy columns to LSIDs:

# Translate every taxonomy cell first to its cleaned name and then to its
# LSID. names_clean and lsids are named vectors, so vector[.] acts as a
# lookup table; values without a match become NA.
# across(everything(), ...) replaces the superseded mutate_all().
taxonomy_clean <- taxonomy %>%
  mutate(across(everything(), ~names_clean[.])) %>%
  mutate(across(everything(), ~lsids[.]))

taxonomy_clean
## # A tibble: 47,372 × 6
##    phylum         class          order         family genus scientificName      
##    <chr>          <chr>          <chr>         <chr>  <chr> <chr>               
##  1 urn:lsid:mari… urn:lsid:mari… <NA>          <NA>   <NA>  urn:lsid:marinespec…
##  2 urn:lsid:mari… urn:lsid:mari… urn:lsid:mar… <NA>   <NA>  urn:lsid:marinespec…
##  3 urn:lsid:mari… urn:lsid:mari… urn:lsid:mar… <NA>   <NA>  urn:lsid:marinespec…
##  4 urn:lsid:mari… urn:lsid:mari… urn:lsid:mar… <NA>   <NA>  urn:lsid:marinespec…
##  5 urn:lsid:mari… urn:lsid:mari… urn:lsid:mar… <NA>   <NA>  urn:lsid:marinespec…
##  6 urn:lsid:mari… <NA>           <NA>          <NA>   <NA>  urn:lsid:marinespec…
##  7 urn:lsid:mari… urn:lsid:mari… <NA>          <NA>   <NA>  urn:lsid:marinespec…
##  8 urn:lsid:mari… urn:lsid:mari… <NA>          <NA>   <NA>  urn:lsid:marinespec…
##  9 urn:lsid:mari… urn:lsid:mari… <NA>          <NA>   <NA>  urn:lsid:marinespec…
## 10 <NA>           <NA>           <NA>          <NA>   <NA>  <NA>                
## # … with 47,362 more rows

Then find the most specific one for each row:

# Keep the most specific available LSID for each record. phylum is included
# as the last fallback: the taxonomy selection above includes it, and without
# it records identified only to phylum level (e.g. row 6 in the preview)
# would end up with a missing scientificNameID.
taxonomy_clean <- taxonomy_clean %>%
  mutate(best = coalesce(scientificName, genus, family, order, class, phylum))

I’ll use the resulting LSIDs to replace the provided scientificNameIDs.

# Overwrite the provided scientificNameID with the matched WoRMS LSID.
occ$scientificNameID <- taxonomy_clean$best

Let’s take another look at the top scientificName and scientificNameID after matching:

# Top 30 scientificName / scientificNameID combinations after matching.
occ %>%
  group_by(scientificName, scientificNameID) %>%
  tally(name = "records") %>%
  arrange(desc(records)) %>%
  head(30) %>%
  knitr::kable()
scientificName scientificNameID records
polychaeta urn:lsid:marinespecies.org:taxname:883 2844
hymenopenaeus nereus urn:lsid:marinespecies.org:taxname:377453 2758
isopoda urn:lsid:marinespecies.org:taxname:1131 2163
plesiopenaeus armatus urn:lsid:marinespecies.org:taxname:107085 2003
coryphaenoides armatus or yaquinae urn:lsid:marinespecies.org:taxname:125748 1590
amphipoda urn:lsid:marinespecies.org:taxname:1135 1562
plesiodiadema urn:lsid:marinespecies.org:taxname:123393 1313
malacostraca urn:lsid:marinespecies.org:taxname:1071 1015
nematoda urn:lsid:marinespecies.org:taxname:799 969
monothalamea urn:lsid:marinespecies.org:taxname:744106 965
tanaidacea urn:lsid:marinespecies.org:taxname:1133 901
harpacticoida urn:lsid:marinespecies.org:taxname:1102 852
ostracoda urn:lsid:marinespecies.org:taxname:1078 829
ophiosphalma glabrum urn:lsid:marinespecies.org:taxname:244923 560
pachycara bulbiceps urn:lsid:marinespecies.org:taxname:127121 479
thalassomonhystera urn:lsid:marinespecies.org:taxname:2448 438
bivalvia urn:lsid:marinespecies.org:taxname:105 423
copepoda urn:lsid:marinespecies.org:taxname:1080 349
calanoida urn:lsid:marinespecies.org:taxname:1100 345
rimicaris rimicaris exoculata urn:lsid:marinespecies.org:taxname:107001 342
spionidae urn:lsid:marinespecies.org:taxname:913 327
eurythenes gryllus urn:lsid:marinespecies.org:taxname:102563 324
annelida urn:lsid:marinespecies.org:taxname:882 305
ophiuroidea urn:lsid:marinespecies.org:taxname:123084 287
tantulocarida urn:lsid:marinespecies.org:taxname:1083 284
gastropoda urn:lsid:marinespecies.org:taxname:101 270
tardigrada urn:lsid:marinespecies.org:taxname:1276 264
loricifera urn:lsid:marinespecies.org:taxname:101061 262
chromadoridae urn:lsid:marinespecies.org:taxname:2162 259
cumacea urn:lsid:marinespecies.org:taxname:1137 233

Extracting MeasurementOrFact data

# Build the MeasurementOrFact table: flatten the nested column, strip group
# prefixes from the names, normalize whitespace and empty strings, and drop
# rows that lack a measurement type or value.
extract_mof <- function(df) {
  flat <- df %>%
    select("id", "dataset_id", "MeasurementOrFact") %>%
    jsonlite::flatten()
  flat %>%
    rename_all(~str_replace(., ".*\\.", "")) %>%
    mutate(across(where(is.character), str_squish)) %>%
    mutate_all(~na_if(., "")) %>%
    filter(!is.na(measurementType) & !is.na(measurementValue)) %>%
    as_tibble()
}

mof <- extract_mof(records)
mof
## # A tibble: 12,377 × 6
##    id             dataset_id     measurementID  measurementType measurementValue
##    <chr>          <chr>          <chr>          <chr>           <chr>           
##  1 2d69af50-e297… bgrpmn12015_b… MUC20401639879 Relative abund… 11.9718309859155
##  2 5e86f2d4-bb8f… bgrpmn12015_b… MUC20411639880 Relative abund… 5.63380281690141
##  3 c615232e-467b… bgrpmn12015_b… MUC20421639881 Relative abund… 15.4929577464789
##  4 3238b022-f96c… bgrpmn12015_b… MUC20431639882 Relative abund… 33.0985915492958
##  5 e4f14773-37a4… bgrpmn12015_b… MUC20441639883 Relative abund… 10.5633802816901
##  6 b48d98b6-5827… bgrpmn12015_b… MUC20451639884 Relative abund… 11.9718309859155
##  7 c753b854-512e… bgrpmn12015_b… MUC20461639885 Relative abund… 5.63380281690141
##  8 84afdc08-b9e7… bgrpmn12015_b… MUC20471639886 Relative abund… 0.7042253521126…
##  9 d2d09102-809e… bgrpmn12015_b… MUC20481639887 Relative abund… 0.7042253521126…
## 10 1db232d4-90b2… bgrpmn12015_b… MUC20491639888 Relative abund… 2.8169014084507 
## # … with 12,367 more rows, and 1 more variable: measurementUnit <chr>

A number of records appear to have empty values. To demonstrate this, let’s take a look at the most common combinations of measurementType and measurementValue:

# Most common measurementType / measurementValue combinations; this exposes
# the placeholder values ("ns", "in progress") present in the data.
mof %>%
  group_by(measurementType, measurementValue) %>%
  tally(name = "records") %>%
  arrange(desc(records)) %>%
  head(10) %>%
  knitr::kable()
measurementType measurementValue records
Relative abundance ns 4058
Relative abundance 0 452
Relative abundance in progress 141
Relative abundance 0.09765625 78
Relative abundance 0.282485875706215 72
Relative abundance 6.25 58
Relative abundance 0.25 56
Relative abundance 0.26525198938992 56
Relative abundance 7.69230769230769 53
Relative abundance 20 52

Generating Darwin Core Archives

Generating EML

For demonstration purposes, I’m working with the dataset pertaining to the first record here. The EML template is read from templates/eml.xml:

library(readr)
library(glue)

# Generate an EML document for the dataset of the first record in df by
# interpolating the local variables below into templates/eml.xml via glue().
# NOTE: the local variable names are part of the contract with the template
# file; renaming any of them silently breaks interpolation.
generate_eml <- function(df) {
  eml <- read_file("templates/eml.xml")
  # Metadata is a nested data frame column; take the first row only.
  metadata <- df$Metadata[1,]

  # NOTE(review): the unprefixed variables below are filled from
  # Metadata$Creator, while the creator_* variables are mostly empty or taken
  # from Metadata$Contact. This split looks inverted — confirm against
  # templates/eml.xml which placeholders feed the <creator> block and which
  # the <metadataProvider>/<contact> blocks before changing anything.
  firstname <- strsplit(metadata$Creator$name, " ")[[1]][1]
  lastname <- strsplit(metadata$Creator$name, " ")[[1]][2]
  organization <- metadata$Creator$organisation
  email <- metadata$Creator$email
  position <- metadata$Creator$position

  creator_firstname <- ""
  creator_lastname <- ""
  creator_organization <- metadata$Contact$organisation
  creator_email <- ""
  creator_position <- ""

  abstract <- metadata$abstract
  title <- metadata$title
  citation <- metadata$citation
  packageid <- "https://datasets.obis.org/deepdata"
  # pubDate / dateStamp reflect generation time, not data collection time.
  pubdate <- format(Sys.time(), "%Y-%m-%d")
  datestamp <- format(Sys.time(), "%Y-%m-%dT%H:%M:%S%z")
  glue(eml)
}

generate_eml(records)
## <eml:eml xmlns:eml="eml://ecoinformatics.org/eml-2.1.1"
##   xmlns:dc="http://purl.org/dc/terms/"
##   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
##   xsi:schemaLocation="eml://ecoinformatics.org/eml-2.1.1 http://rs.gbif.org/schema/eml-gbif-profile/1.1/eml.xsd"
##   packageId="https://datasets.obis.org/deepdata" system="http://gbif.org" scope="system"
##   xml:lang="eng">
## 
## <dataset>
##   <title xml:lang="eng">BGRPMN12015 BIODIVERSITY</title>
##   <pubDate>2021-11-24</pubDate>
##   <language>eng</language>
##   <abstract>
##     <para>Sampling data captured in Oceanic Exploration Research Based mainly on nodule abundances in 221 box core samples and the hydro-acoustic data obtained during seven cruises to the German license area, resource modelling using artificial neural network statistics was carried out for the entire license area of 75,000 km2 size, and in particular for four prospective potential mining areas in the eastern license area with a total size of 1038 km2. According to these assessments, the entire license area contains 620 million metric tonnes of nodules (dry weight) with 195 million t of Mn, 8.7 million t of Ni, 7.3 million t of Cu, 1.6 million t of Ti, and 1.1 million t of Co. Results
## of geostatistical analyses show that the mean nodule abundance in the four prospective areas in the eastern license area (“sub-clusters”) varies between 17.7 and 23.1 kg/m2 and the resources here amount to 14 million metric tonnes of nodules in total (dry weight), containing 4.4 million t of Mn, 198,000 t of Ni, 165,000 t of Cu, 36,000 t of Ti, and 24,000 t of Co.</para>
##   </abstract>
##   <keywordSet>
##     <keyword>Occurrence</keyword>
##     <keywordThesaurus>GBIF Dataset Type Vocabulary: http://rs.gbif.org/vocabulary/gbif/dataset_type.xml</keywordThesaurus>
##   </keywordSet>
##   <intellectualRights>
##     <para>This work is licensed under a <ulink url="http://creativecommons.org/licenses/by/4.0/legalcode"><citetitle>Creative Commons Attribution (CC-BY) 4.0 License</citetitle></ulink>.</para>
##   </intellectualRights>
##   <maintenance>
##     <description>
##       <para></para>
##     </description>
##     <maintenanceUpdateFrequency>unkown</maintenanceUpdateFrequency>
##   </maintenance>
##   <creator>
##     <individualName>
##     <givenName></givenName>
##     <surName></surName>
##     </individualName>
##     <organizationName>Federal Institute for Geosciences and Natural Resources of Germany</organizationName>
##     <positionName></positionName>
##     <electronicMailAddress></electronicMailAddress>
##   </creator>
##   <metadataProvider>
##     <individualName>
##     <givenName>Sheldon</givenName>
##     <surName>Carter</surName>
##     </individualName>
##     <organizationName>International Seabed Authority</organizationName>
##     <positionName>Database Manager</positionName>
##     <electronicMailAddress>scarter@isa.org.jm</electronicMailAddress>
##   </metadataProvider>
##   <contact>
##     <individualName>
##     <givenName>Sheldon</givenName>
##     <surName>Carter</surName>
##     </individualName>
##     <organizationName>International Seabed Authority</organizationName>
##     <positionName>Database Manager</positionName>
##     <electronicMailAddress>scarter@isa.org.jm</electronicMailAddress>
##   </contact>
## </dataset>
## <additionalMetadata>
## <metadata>
## <gbif>
##   <dateStamp>2021-11-24T21:11:29+0100</dateStamp>
##   <hierarchyLevel>dataset</hierarchyLevel>
##   <citation>Federal Institute for Geosciences and Natural Resources of Germany, (2015). BGRPMN12015 BIODIVERSITY. Available : DeepData, International Seabed Authority https://data.isa.org.jm/ Accessed: [YYYY-MM-DD].</citation>
## </gbif>
## </metadata>
## </additionalMetadata>
## </eml:eml>

Generating an archive descriptor file

The archive also needs to include a meta.xml file which describes the files in the archive and their relationships.

Let’s first get a list of terms including their qualName.

library(xml2)

# Read a GBIF core/extension definition XML and return a tibble with each
# term's short name and fully qualified name (qualName); used to build the
# meta.xml archive descriptor. The original assigned the result to a dead
# local variable and relied on the assignment's invisible value; the pipe
# result is now returned directly.
get_terms <- function(url) {
  doc <- read_xml(url)
  doc %>%
    xml_ns_strip() %>%
    xml_find_all(".//property") %>%
    map_df(function(x) {
      list(
        name = xml_attr(x, "name"),
        qual = xml_attr(x, "qualName")
      )
    })
}

occurrence_terms <- get_terms("https://rs.gbif.org/core/dwc_occurrence_2020-07-15.xml")
mof_terms <- get_terms("https://rs.gbif.org/extension/obis/extended_measurement_or_fact.xml")

Using these we can generate a list of terms to go into the meta.xml file for each table.

# Generate the meta.xml archive descriptor by mapping the column names of the
# occurrence and measurement tables to their qualified Darwin Core terms and
# interpolating the resulting <field> lines into templates/meta.xml.
# Reads the globals occurrence_terms and mof_terms built above. NOTE: the
# local names occurrence_lines / mof_lines are referenced from the glue()
# template, so renaming them would break interpolation. Only the column
# NAMES of occ and mof matter here, not their rows.
generate_meta <- function(occ, mof) {
  occurrence_fields <- tibble(name = names(occ)) %>%
    left_join(occurrence_terms, by = "name") %>%
    mutate(index = as.numeric(row.names(.)) - 1)
  
  occurrence_lines <- paste0("<field index=\"", occurrence_fields$index, "\" term=\"", occurrence_fields$qual, "\"/>")
  # Column 0 is the record id linking core and extension rows.
  occurrence_lines[1] <- "<id index=\"0\" />"
  occurrence_lines <- paste0(occurrence_lines, collapse = "\n")

  mof_fields <- tibble(name = names(mof)) %>%
  left_join(mof_terms, by = "name") %>%
  mutate(index = as.numeric(row.names(.)) - 1)

  mof_lines <- paste0("<field index=\"", mof_fields$index, "\" term=\"", mof_fields$qual, "\"/>")
  # Extension rows point back to the core record via coreid.
  mof_lines[1] <- "<coreid index=\"0\" />"
  mof_lines <- paste0(mof_lines, collapse = "\n")

  meta <- read_file("templates/meta.xml")
  glue(meta)
}

generate_meta(occ, mof)
## <archive xmlns="http://rs.tdwg.org/dwc/text/" metadata="eml.xml">
##   <core encoding="UTF-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
##     <files>
##       <location>occurrence.txt</location>
##     </files>
##     <id index="0" />
## <field index="1" term="NA"/>
## <field index="2" term="http://rs.tdwg.org/dwc/terms/occurrenceID"/>
## <field index="3" term="http://rs.tdwg.org/dwc/terms/catalogNumber"/>
## <field index="4" term="http://rs.tdwg.org/dwc/terms/recordedBy"/>
## <field index="5" term="http://rs.tdwg.org/dwc/terms/individualCount"/>
## <field index="6" term="http://rs.tdwg.org/dwc/terms/organismQuantity"/>
## <field index="7" term="http://rs.tdwg.org/dwc/terms/organismQuantityType"/>
## <field index="8" term="http://rs.tdwg.org/dwc/terms/sex"/>
## <field index="9" term="http://rs.tdwg.org/dwc/terms/occurrenceStatus"/>
## <field index="10" term="http://rs.tdwg.org/dwc/terms/associatedSequences"/>
## <field index="11" term="http://rs.tdwg.org/dwc/terms/occurrenceRemarks"/>
## <field index="12" term="http://rs.tdwg.org/dwc/terms/eventID"/>
## <field index="13" term="http://rs.tdwg.org/dwc/terms/eventDate"/>
## <field index="14" term="http://rs.tdwg.org/dwc/terms/eventTime"/>
## <field index="15" term="http://rs.tdwg.org/dwc/terms/year"/>
## <field index="16" term="http://rs.tdwg.org/dwc/terms/month"/>
## <field index="17" term="http://rs.tdwg.org/dwc/terms/day"/>
## <field index="18" term="http://rs.tdwg.org/dwc/terms/habitat"/>
## <field index="19" term="http://rs.tdwg.org/dwc/terms/samplingProtocol"/>
## <field index="20" term="http://rs.tdwg.org/dwc/terms/eventRemarks"/>
## <field index="21" term="http://rs.tdwg.org/dwc/terms/locationID"/>
## <field index="22" term="http://rs.tdwg.org/dwc/terms/minimumDepthInMeters"/>
## <field index="23" term="http://rs.tdwg.org/dwc/terms/maximumDepthInMeters"/>
## <field index="24" term="http://rs.tdwg.org/dwc/terms/verbatimDepth"/>
## <field index="25" term="http://rs.tdwg.org/dwc/terms/decimalLatitude"/>
## <field index="26" term="http://rs.tdwg.org/dwc/terms/decimalLongitude"/>
## <field index="27" term="http://rs.tdwg.org/dwc/terms/verbatimCoordinateSystem"/>
## <field index="28" term="http://rs.tdwg.org/dwc/terms/verbatimSRS"/>
## <field index="29" term="http://rs.tdwg.org/dwc/terms/coordinateUncertaintyInMeters"/>
## <field index="30" term="http://rs.tdwg.org/dwc/terms/identificationID"/>
## <field index="31" term="http://rs.tdwg.org/dwc/terms/typeStatus"/>
## <field index="32" term="http://rs.tdwg.org/dwc/terms/identifiedBy"/>
## <field index="33" term="http://rs.tdwg.org/dwc/terms/dateIdentified"/>
## <field index="34" term="http://rs.tdwg.org/dwc/terms/identificationVerificationStatus"/>
## <field index="35" term="http://purl.org/dc/terms/type"/>
## <field index="36" term="http://purl.org/dc/terms/license"/>
## <field index="37" term="http://purl.org/dc/terms/rightsHolder"/>
## <field index="38" term="http://purl.org/dc/terms/accessRights"/>
## <field index="39" term="http://purl.org/dc/terms/bibliographicCitation"/>
## <field index="40" term="http://rs.tdwg.org/dwc/terms/institutionID"/>
## <field index="41" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
## <field index="42" term="http://rs.tdwg.org/dwc/terms/taxonID"/>
## <field index="43" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
## <field index="44" term="http://rs.tdwg.org/dwc/terms/scientificNameID"/>
## <field index="45" term="http://rs.tdwg.org/dwc/terms/kingdom"/>
## <field index="46" term="http://rs.tdwg.org/dwc/terms/phylum"/>
## <field index="47" term="http://rs.tdwg.org/dwc/terms/class"/>
## <field index="48" term="http://rs.tdwg.org/dwc/terms/order"/>
## <field index="49" term="http://rs.tdwg.org/dwc/terms/family"/>
## <field index="50" term="http://rs.tdwg.org/dwc/terms/genus"/>
## <field index="51" term="http://rs.tdwg.org/dwc/terms/taxonRank"/>
## <field index="52" term="http://rs.tdwg.org/dwc/terms/taxonomicStatus"/>
## <field index="53" term="http://rs.tdwg.org/dwc/terms/taxonRemarks"/>
##   </core>
##   <extension encoding="UTF-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact">
##     <files>
##       <location>extendedmeasurementorfact.txt</location>
##     </files>
##     <coreid index="0" />
## <field index="1" term="NA"/>
## <field index="2" term="http://rs.tdwg.org/dwc/terms/measurementID"/>
## <field index="3" term="http://rs.tdwg.org/dwc/terms/measurementType"/>
## <field index="4" term="http://rs.tdwg.org/dwc/terms/measurementValue"/>
## <field index="5" term="http://rs.tdwg.org/dwc/terms/measurementUnit"/>
##   </extension>
## </archive>

Bringing it all together

Now we can generate an archive for each dataset. While I’m generating datasets I’m also populating the RSS feed and creating dataset landing pages.

baseurl <- "https://datasets.obis.org/hosted/isa/"
item_template <- read_file("templates/rss_item.xml")
landing_template <- read_file("templates/index_dataset.html")
items <- list()
pubdate <- format(Sys.time(), "%a, %d %b %Y %H:%M:%S %z")

unlink("output", recursive = TRUE)
dir.create("output")

datasetids <- unique(records$dataset_id)

# The archive descriptor depends only on the column names of occ and mof,
# which are identical for every dataset, so generate it once outside the loop.
meta <- generate_meta(occ, mof)

for (datasetid in datasetids) {

  dataset <- records %>%
    filter(dataset_id == datasetid) %>%
    head(1)

  # Escape XML special characters in the abstract. "&" must be handled first
  # to avoid double-escaping, and str_replace_all() is required because each
  # character can occur more than once per abstract (str_replace would only
  # escape the first occurrence, producing invalid XML).
  dataset$Metadata$abstract <- dataset$Metadata$abstract %>%
    str_replace_all(., "&", "&amp;") %>%
    str_replace_all(., ">", "&gt;") %>%
    str_replace_all(., "<", "&lt;") %>%
    str_replace_all(., "'", "&apos;") %>%
    str_replace_all(., "\"", "&quot;")

  # These locals are interpolated into the glue() templates below; their
  # names are part of the contract with the template files.
  title <- dataset$Metadata$title
  abstract <- dataset$Metadata$abstract
  link <- paste0(baseurl, datasetid, "/index.html")
  dwca <- paste0(baseurl, datasetid, "/", datasetid, ".zip")

  # clear dataset directory

  unlink(paste0("output/", datasetid), recursive = TRUE)
  dir.create(paste0("output/", datasetid))

  # RSS feed items

  item <- glue(item_template)
  items[[datasetid]] <- item

  # dataset landing page

  landing <- glue(landing_template)
  writeLines(landing, paste0("output/", datasetid, "/index.html"))

  # archive

  dataset_occ <- occ %>% filter(dataset_id == datasetid)
  dataset_mof <- mof %>% filter(dataset_id == datasetid)

  eml <- generate_eml(dataset)

  write.table(dataset_occ, file = paste0("output/", datasetid, "/occurrence.txt"), sep = "\t", row.names = FALSE, na = "", quote = FALSE)
  write.table(dataset_mof, file = paste0("output/", datasetid, "/extendedmeasurementorfact.txt"), sep = "\t", row.names = FALSE, na = "", quote = FALSE)
  writeLines(eml, paste0("output/", datasetid, "/eml.xml"))
  writeLines(meta, paste0("output/", datasetid, "/meta.xml"))

  # zip from inside the dataset directory so the archive contains flat file
  # names; setwd() returns the previous directory, which is restored
  # explicitly so a failure path cannot leave us one level off.
  files <- c("occurrence.txt", "extendedmeasurementorfact.txt", "eml.xml", "meta.xml")
  olddir <- setwd(paste0("output/", datasetid))
  zip(glue("{datasetid}.zip"), files)
  for (f in files) {
    file.remove(f)
  }
  setwd(olddir)

}

Data publishing

In this section all files are uploaded to an S3 bucket. A list of datasets is visible at https://datasets.obis.org/hosted/isa/index.html, and an RSS file is available for the OBIS harvester.

Generate RSS file

# Concatenate the per-dataset RSS items and interpolate them, together with
# the channel title/description/link, into templates/rss.xml. The local
# variable names below must match the placeholders in the template.
items <- paste0(items, collapse = "\n")
rss_template <- read_file("templates/rss.xml")

title <- "International Seabed Authority (ISA)"
description <- "International Seabed Authority (ISA)"
link <- paste0(baseurl, "index.html")

rss <- glue(rss_template)
writeLines(rss, "output/rss.xml")

Generate landing page

# Build the top-level landing page: one <li> link per dataset, interpolated
# into templates/index.html via the `content` placeholder.
index_template <- read_file("templates/index.html")
content <- paste0(paste0("<li><a href=\"", datasetids, "/index.html\">", datasetids, "</a></li>"), collapse = "\n")
index <- glue(index_template)
writeLines(index, "output/index.html")

Uploading to S3

# Remove everything previously published under the hosted/isa/ prefix.
# A single delete_object() on the bare prefix only deletes one literal key —
# S3 has no real folders — so list the objects under the prefix and delete
# them individually.
existing <- get_bucket_df("obis-datasets", prefix = "hosted/isa/")
for (key in existing$Key) {
  delete_object(key, bucket = "obis-datasets")
}

files <- list.files("output", full.names = TRUE, recursive = TRUE, include.dirs = FALSE)

# Upload every generated file, mirroring the output/ tree under hosted/isa/.
for (file in files) {
  target <- str_replace(file, "output", "hosted/isa")
  message(target)
  put_object(file, object = target, bucket = "obis-datasets", acl = "public-read")
}